We’re going to explore a large data set or traffic crashes to learn about what factors are connected with injuries. We will use data from the city of Chicago’s open data portal. (This activity is derived from a blog post by Julia Silge)
years_ago <- mdy("01/01/2022") # data from last 2 years. May take time to load!
crash_url <- glue::glue("https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if?$where=CRASH_DATE >= '{years_ago}'")
crash_raw <- as_tibble(read.socrata(crash_url)) # a new way to read in data, don't worry about it!
This dataset is pretty crazy! Take a look at it in the viewer, and then let’s do some data munging to get it into a nicer form.
-create a variable called injuries which indicates if
the crash involved injuries or not. -create an unknown category for
missing report_types -decide which other variables to
keep
crash <- crash_raw %>%
arrange(desc(crash_date)) %>%
transmute(
injuries = as.factor(if_else(injuries_total > 0, "injuries", "none")),
injuries_total,
crash_date = ymd_hms(crash_date),
posted_speed_limit,
weather_condition = as.factor(weather_condition),
lighting_condition = as.factor(lighting_condition),
first_crash_type,
prim_contributory_cause = as.factor(prim_contributory_cause),
sec_contributory_cause,
most_severe_injury = as.factor(most_severe_injury),
crash_hour, crash_day_of_week, crash_month,
latitude, longitude
)
## Warning: There was 1 warning in `transmute()`.
## ℹ In argument: `crash_date = ymd_hms(crash_date)`.
## Caused by warning:
## ! 747 failed to parse.
Here’s a few questions to get you started.
crash %>%
filter(latitude >0) %>%
drop_na(injuries) %>%
ggplot(aes(x=longitude, y=latitude, color=injuries)) +
geom_point(size = 0.5, alpha = 0.4) +
labs(color = NULL) +
scale_color_manual(values = c("deeppink4", "gray80")) +
coord_fixed()
crash %>%
count(prim_contributory_cause) %>%
arrange(desc(n)) %>%
mutate(prim_contributory_cause = fct_reorder(prim_contributory_cause, n)) %>%
slice(2:4,6:7) %>%
ggplot( aes(y = factor(prim_contributory_cause), x=n)) +
geom_bar(stat="identity") +
labs(title="Primary Causes of Crash, top 5", y="")
p <- crash %>%
mutate(
crash_year = as.factor(year(crash_date)),
month_name = month(crash_month, label = TRUE, abbr = TRUE)
) %>%
drop_na(crash_year) %>%
filter(crash_year == 2022 | crash_year == 2023) %>%
group_by(crash_year, month_name) %>%
summarize(n = n()) %>%
ggplot(aes(x=month_name, y = n, group=crash_year, color=crash_year)) +
geom_line() + geom_point() +
ylim(c(0,NA)) +
labs(title="Crashes increased slightly in 2023",
y="Number of Crashes", x="Month", color="Year")
## `summarise()` has grouped output by 'crash_year'. You can override using the
## `.groups` argument.
p
ggplotly(p)
weather_condition and
lighting_condition to explore.p <- crash %>%
drop_na(injuries) %>%
filter(
weather_condition != "UNKNOWN",
lighting_condition != "UNKNOWN"
) %>%
mutate(
slippery = str_detect(weather_condition, 'RAIN|SNOW'),
dark = str_detect(lighting_condition, 'DARK'),
conditions = as.factor(case_when(
(slippery & dark) ~ "Slippery and Dark Conditions",
(slippery & !dark) ~ "Slippery Only",
(!slippery & dark) ~ "Dark Only",
(!slippery & !dark) ~ "Not Slippery or Dark"
)),
conditions = fct_relevel(
conditions,
"Slippery and Dark Conditions", "Dark Only", "Slippery Only", "Not Slippery or Dark"
)
) %>%
ggplot(aes(y=conditions, fill = injuries)) +
geom_bar(position="dodge")
p
crash %>%
drop_na(injuries) %>%
filter(
weather_condition != "UNKNOWN",
lighting_condition != "UNKNOWN"
) %>%
mutate(
slippery = str_detect(weather_condition, 'RAIN|SNOW'),
dark = str_detect(lighting_condition, 'DARK'),
conditions = as.factor(case_when(
(slippery & dark) ~ "Slippery and Dark Conditions",
(slippery & !dark) ~ "Slippery Only",
(!slippery & dark) ~ "Dark Only",
(!slippery & !dark) ~ "Not Slippery or Dark"
)),
conditions = fct_relevel(
conditions,
"Slippery and Dark Conditions", "Dark Only", "Slippery Only", "Not Slippery or Dark"
)
) %>%
group_by(conditions, injuries) %>%
reframe(n=n()) %>%
group_by(conditions) %>%
reframe(injuries = injuries, freq = n / sum(n)) %>%
ggplot(aes(y=conditions, x=freq, fill = injuries)) +
geom_bar(stat="identity") +
scale_x_continuous(limits = c(0, 1), labels = percent_format()) +
labs(x = "Relative Frequency", y="")
crash %>%
drop_na(injuries) %>%
mutate(
speed_limit = as.factor(if_else(posted_speed_limit <= 25, "Speed Limit <= 25", "Speed limit over 25"))) %>%
group_by(speed_limit, injuries) %>%
reframe(n=n()) %>%
group_by(speed_limit) %>%
reframe(injuries = injuries, freq = n / sum(n)) %>%
ggplot(aes(x=speed_limit, y=freq, fill=injuries)) +
geom_bar(stat="identity") +
scale_y_continuous(limits = c(0, NA), labels = percent_format()) +
labs(
title="Injuries are less likely at speeds 25 mph or lower",
x = "",
y = "Relative Frequency")